import json
import tqdm

# Convert a JSONL file of {"instruction": ..., "output": ...} records into
# chat-style JSONL, where each output line has the form
# {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}.
src_file = "data/raw_data.jsonl"
dst_file = "data/std_data.jsonl"


with open(src_file, "r", encoding="utf-8") as infile, \
        open(dst_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        # Blank or whitespace-only lines (e.g. a trailing empty line at the
        # end of the file) are not JSON documents; skip them instead of
        # letting json.loads abort the whole conversion.
        if not line.strip():
            continue

        # Raises json.JSONDecodeError on malformed JSON, and KeyError below
        # when a record lacks "instruction" or "output".
        raw_json = json.loads(line)

        std_format = {
            "messages": [
                {
                    "role": "user",
                    "content": raw_json["instruction"],
                },
                {
                    "role": "assistant",
                    "content": raw_json["output"],
                },
            ]
        }

        # ensure_ascii=False keeps non-ASCII text readable in the output
        # rather than escaping it as \uXXXX; the trailing newline terminates
        # the record so the file stays one JSON object per line.
        outfile.write(json.dumps(std_format, ensure_ascii=False) + "\n")